1. Supporters’ Profile Analysis
1. What Companies/Universities are those programmers from?
# Tally supporters by the company field listed on their GitHub profile,
# dropping blank entries, most common companies first.
company_info <- dt_user_cld %>%
  filter(company != "") %>%
  group_by(company) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
# Display the top companies.
company_info
# Define the company aggregation function.
# Map a free-text company string onto a canonical company name.
# Matching is case-insensitive; patterns mix Chinese names and (partial)
# English spellings. Unrecognized input is returned unchanged.
company_aggregation <- function(name) {
  upper_name <- toupper(name)
  # Canonical name -> regex of known spellings/aliases, checked in order.
  patterns <- c(
    Baidu     = "百度|BAIDU|AIDU",
    Tencent   = "ENCENT|腾讯|TENCENT",
    Alibaba   = "LIBABA|淘宝|AOBAO|LIPAY|阿里巴巴|LIYUN|阿里云",
    JD        = "JD|京东",
    NetEase   = "ETEASE|网易",
    MeiTuan   = "EITUAN|美团",
    ByteDance = "YTEDANCE|字节|头条",
    Eleme     = "ELEME|饿了",
    Huawei    = "UAWEI|华为",
    DiDi      = "DIDI|滴滴|嘀嘀"
  )
  for (target in names(patterns)) {
    if (grepl(patterns[[target]], upper_name)) {
      return(target)
    }
  }
  # No known company matched: keep the original string.
  name
}
# Define the education aggregation function.
# Map a free-text company/affiliation string onto a canonical university
# name. Matching is case-insensitive. Returns NA when no known university
# pattern matches, so non-university affiliations can be filtered out later.
education_aggregation <- function(name) {
  upper_name <- toupper(name)
  # Canonical university name -> regex of known spellings/aliases.
  patterns <- list(
    "Zhejiang University" = "HEJIANG|ZJU|浙江大学|浙大",
    "Tsinghua University" = "SINGHUA|清华",
    "Shanghai Jiao Tong University" = "SHANGHAI JIAO TONG|SJTU|上海交大|上海交通",
    "University of Electronic Science and Technology of China" = "UESTC|电子科大|电子科技",
    "University of Science and Technology of China" = "USTC|中科大|中国科学技术",
    "Fudan University" = "FUDAN|复旦",
    "Harbin Institute of Technology" = "ARBIN|哈",
    "Beijing University of Post and Telecommunications" = "BUPT|北邮|北京邮电"
  )
  for (school in names(patterns)) {
    if (grepl(patterns[[school]], upper_name)) {
      return(school)
    }
  }
  # No known university matched.
  NA
}
# Aggregating disparse companies.
# Aggregate disparate company spellings into canonical company and
# university names, one entry per row of company_info.
# seq_len() is used instead of 1:nrow() so a zero-row table iterates
# zero times rather than over c(1, 0).
n_companies <- nrow(company_info)
agg_companies <- rep(NA, n_companies)
agg_education <- rep(NA, n_companies)
for (i in seq_len(n_companies)) {
  agg_companies[i] <- company_aggregation(company_info$company[i])
  agg_education[i] <- education_aggregation(company_info$company[i])
}
# Attach both aggregated columns to the per-company counts.
company_info_agg <- cbind(company_info, agg_companies, agg_education)
# Show the top ten companies with the most supporters of 996.icu.
# NOTE(review): n() counts the number of distinct raw company strings that
# map to each aggregated name, not the number of developers; if a per-person
# count is intended, sum(count) should be used instead — confirm intent.
company_info_agg %>% group_by(agg_companies) %>%
summarise(count = n()) %>%
arrange(desc(count)) %>%
head(10)
# Show which universities these developers are from (NA = no university
# pattern matched the affiliation string).
# NOTE(review): n() counts distinct raw affiliation strings per university,
# not individual developers; sum(count) would count developers — confirm intent.
company_info_agg %>% group_by(agg_education) %>%
summarise(count = n()) %>%
arrange(desc(count)) %>%
filter(!is.na(agg_education)) %>%
head(10)
Factor `agg_education` contains implicit NA, consider using `forcats::fct_explicit_na`
NA
As the tables above show, most of the identified developers (those with a company listed on GitHub) come from major IT companies such as Tencent, Baidu, Alibaba, and JD. The universities with the most supporters of the anti-996 movement are Zhejiang University, Beijing University of Posts and Telecommunications, and Harbin Institute of Technology.
2. What cities are those developers from?
#
# Define the function for aggregating the cities.
# Map a free-text location string onto a canonical city name.
# Matching is case-insensitive; unrecognized input is returned unchanged.
#
# Fixes two related bugs in the original version:
#   1. The Guangzhou branch assigned "Hangzhou" as its target name.
#   2. The Hangzhou pattern "ANGZHOU" also matched "GUANGZHOU", so the
#      Guangzhou branch was unreachable anyway. Since the input is already
#      upper-cased, the full spellings "HANGZHOU"/"GUANGZHOU" are used,
#      which cannot collide.
city_aggregation <- function(name) {
  # Make case insensitive.
  orig_name <- name
  name <- toupper(name)
  # Detect pattern and map to the canonical city name.
  if (grepl("EIJING|北京", name)) {
    target_name <- "Beijing"
  } else if (grepl("HANGHAI|上海", name)) {
    target_name <- "Shanghai"
  } else if (grepl("GUANGZHOU|广州", name)) {
    target_name <- "Guangzhou"
  } else if (grepl("HANGZHOU|杭州", name)) {
    target_name <- "Hangzhou"
  } else if (grepl("HENGDU|成都", name)) {
    target_name <- "Chengdu"
  } else if (grepl("ANJING|南京", name)) {
    target_name <- "Nanjing"
  } else if (grepl("INGAPORE|新加坡", name)) {
    target_name <- "Singapore"
  } else if (grepl("HONG KONG|香港|HK", name)) {
    target_name <- "Hong Kong"
  } else if (grepl("UHAN|武汉", name)) {
    target_name <- "Wuhan"
  } else {
    # No known city matched: keep the original string.
    target_name <- orig_name
  }
  return (target_name)
}
# Tally supporters by the location field, excluding blank entries and the
# country-level label "China" (too coarse to assign to a city).
city_info <- dt_user_cld %>%
  group_by(location) %>%
  summarise(count = n()) %>%
  filter(location != "",
         location != "China") %>%
  arrange(desc(count))
# Map each raw location string to its canonical city name.
# seq_len() is used instead of 1:nrow() so a zero-row table iterates
# zero times rather than over c(1, 0).
agg_cities <- rep(NA, nrow(city_info))
for (i in seq_len(nrow(city_info))) {
  agg_cities[i] <- city_aggregation(city_info$location[i])
}
city_info_agg <- cbind(city_info, agg_cities)
# Show the top ten cities with the most supporters of 996.icu.
# NOTE(review): n() counts distinct raw location strings per aggregated city,
# not individual developers; sum(count) would count developers — confirm intent.
# The "" / "China" filters here are redundant (both values were already
# removed when city_info was built), but harmless.
city_info_agg %>% group_by(agg_cities) %>%
summarise(count = n()) %>%
filter(agg_cities != "",
agg_cities != "China") %>%
arrange(desc(count)) %>%
head(10)
According to the statistics in the table above, Hangzhou — home to Alibaba's headquarters — has the most developers supporting the anti-996 movement. Beijing, where Baidu and JD are headquartered, also has many supporters.
5. Distribution Plot of Supporters’ Registration Duration.
# Compute, for each supporter, the number of days since their GitHub
# account was created. The snapshot date is fixed so the analysis is
# reproducible.
today <- lubridate::ymd("2019-04-29")
dt_user_cld <- dt_user_cld %>%
  mutate(
    # Convert the created_at -> snapshot interval into a numeric day count.
    duration = as.numeric(as.duration(interval(created_at, today)), "days")
  )
# Report the average registration age in years.
print(mean(dt_user_cld$duration) / 365)
[1] 3.321946
# Histogram of registration durations (in years), with the mean marked
# by a dotted red vertical line.
avg_years <- mean(dt_user_cld$duration) / 365
dt_user_cld %>%
  ggplot(aes(x = duration / 365)) +
  geom_histogram(col = "black", fill = "grey", alpha = 0.7) +
  geom_vline(xintercept = avg_years,
             linetype = "dotted", color = "red", size = 1.5) +
  labs(x = "Number of Years Since Registration",
       y = "Frequency / Count",
       title = "Distribution Plot of Supporters' Registration Duration")

On average, supporters registered their GitHub accounts about 3 years ago.
2. Statistical Modeling
1. Analyzing the Relationship Between Followers and other factors.
# Keep only the numeric user statistics used in the analysis.
user_stat <- dt_user_cld %>%
  select(followers, following, public_repos, duration)
# Sanity check: peek at the first rows.
user_stat %>% head(10)
# Unsupervised learning: PCA on centered and scaled variables.
user_pca <- prcomp(user_stat, center = TRUE, scale. = TRUE)
print(user_pca)
Standard deviations (1, .., p=4):
[1] 1.1689728 0.9810532 0.9575395 0.8684211
Rotation (n x k) = (4 x 4):
PC1 PC2 PC3 PC4
followers 0.3419542 -0.8727076 0.30951456 0.1601545
following 0.5162926 0.3258117 0.60978132 -0.5054259
public_repos 0.6008709 0.3352248 -0.09115495 0.7199092
duration 0.5054339 -0.1408987 -0.72391868 -0.4479128
summary(user_pca)
Importance of components:
PC1 PC2 PC3 PC4
Standard deviation 1.1690 0.9811 0.9575 0.8684
Proportion of Variance 0.3416 0.2406 0.2292 0.1885
Cumulative Proportion 0.3416 0.5822 0.8115 1.0000
# Supervised learning: linear regression.
# Response:   dt_user_cld$followers
# Predictors: following, public_repos, duration
# NOTE(review): the result name `lm` shadows stats::lm(); it is kept
# because later chunks reference it (e.g. plot(lm)), but a name such as
# `followers_lm` would be safer.
lm <- lm(followers ~ following+public_repos+duration, data = dt_user_cld)
summary(lm)
Call:
lm(formula = followers ~ following + public_repos + duration,
data = dt_user_cld)
Residuals:
Min 1Q Median 3Q Max
-1102.5 -12.8 -5.7 1.1 13756.7
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.006e+01 1.276e+00 -7.884 3.24e-15 ***
following 5.212e-02 3.965e-03 13.144 < 2e-16 ***
public_repos 6.555e-02 1.088e-02 6.023 1.73e-09 ***
duration 1.560e-02 9.397e-04 16.605 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 122.6 on 39983 degrees of freedom
Multiple R-squared: 0.01515, Adjusted R-squared: 0.01508
F-statistic: 205.1 on 3 and 39983 DF, p-value: < 2.2e-16
plot(lm)




#
Because the adjusted R-squared is very low (0.01508), we further checked the linear-model assumptions with diagnostic plots. The plots show that the data violate several of these assumptions, so linear regression is not a good fit for this dataset.
3. Main Questions/Issues of Supporters
1. Trending Issues.
# Show the ten issues that attracted the most comments.
top_issues <- dt_issues_cld %>%
  select(title, comments) %>%
  arrange(desc(comments))
head(top_issues, 10)
# Pseudo top-ten issues, manually translated into English, with their
# comment counts. stringsAsFactors = FALSE keeps the titles as character
# strings (equivalent to the original's mutate(as.character) step).
psuedo_issues <- data.frame(
  title = c("Discussion Thread",
            "Any 'Working under 996, sicking in ICU' wallpapers to use?",
            "Afterwards, I could put 'participated in an open-source project with over 2000+ stars' on my resume",
            "I don't understant the law, but I'm wondering if there is any legal issue involved?",
            "Can this repository be in the top-ten stars list on GitHub?",
            "Substantial suggestions regarding the anti-996 movements.",
            "It's ugly that the developers taking salaries while complaining about their companies.",
            "Working overtime tonight, will delete the database when this repo reach over 100k stars",
            "Cute girl born in 1996 is looking for developer boyfriend now.",
            "Worship the original post"),
  comments = c(1243, 62, 53, 39, 37, 30, 30, 26, 25, 24),
  stringsAsFactors = FALSE
)
psuedo_issues
2. Text Analysis About Topics of 996 Movement.
# Set up the jiebaR segmentation engine with a custom stop-word list.
segmenter <- worker(stop_word = "data/stopwords.txt")
# Segment both the issue titles and bodies into individual tokens.
tokens <- c(segmenter[dt_issues_cld$title], segmenter[dt_issues_cld$body])
# Drop single-character tokens (mostly noise).
tokens <- tokens[nchar(tokens) > 1]
# Mark the Chinese token vector as UTF-8 encoded.
Encoding(tokens) <- "UTF-8"
# Keep the 100 most frequent tokens for the word cloud.
seg_df <- data.frame(seg = tokens) %>%
  group_by(seg) %>%
  summarise(freq = n()) %>%
  arrange(desc(freq)) %>%
  head(100)
# Generate the word cloud (Chinese version).
# Temporarily switch to a CJK-capable font so the Chinese glyphs render,
# then restore the previous family. The original saved font_family but
# never restored it, leaking the par() change into all later plots.
font_family <- par("family")
par(family = "Adobe Heiti Std R")
wordcloud(words=seg_df$seg, freq=seg_df$freq,
colors=brewer.pal(8,"Dark2"),
scale=c(4, 0.8))
par(family = font_family)

# Load the translated (Chinese -> English) token table.
# NOTE(review): read.table() is called without header or encoding options
# and [-1, ] drops the first row — presumably a header line; confirm
# against the file. This also assumes the row order of "data/translate"
# matches seg_df exactly — verify before trusting the English cloud.
trans <- read.table("data/translate", sep="\t")[-1,]
seg_df <- cbind(seg_df, engl = trans$V2)
# Generate the word cloud (English version) from the translated tokens.
wordcloud(words=seg_df$engl, freq=seg_df$freq,
colors=brewer.pal(8,"Dark2"),
scale=c(4, 0.8))

As the word cloud shows, a larger word corresponds to a more frequent term. In the issues section of the 996 repository, people talk mostly about overtime, getting off work, companies, resistance, front-end work, and jobs.